using Dates
using MultivariateStats
using Plots
using NCDatasets
using StatsBase
using Unitful
using DataFrames
using Statistics
# Global plotting defaults applied to every figure in this notebook.
# NOTE(review): the trailing "Project 1" is heading text fused onto this
# line by the notebook export.
Plots.default(; margin=4Plots.mm, size=(700, 400), linewidth=2)Project 1
Bias Correction: Quantile Quantile Mapping
Module 2
Project
1 Setup
2 Precipitation Data: ERA5 Reanalysis Dataset
# Dataset covers daily precipitation (mm) from 1/1/1979 to 12/31/2020.
# Open the daily precipitation NetCDF file (lon × lat × time cube).
ds_precip = NCDataset("/Users/coleman/Documents/GitHub/project-01-precipitation-downscaling-ColemanNickum/data/raw/precip.nc")
# NOTE(review): everything after `println(ds_precip)` on the next line is
# captured REPL output fused onto the code by the notebook export.
println(ds_precip)Dataset: /Users/coleman/Documents/GitHub/project-01-precipitation-downscaling-ColemanNickum/data/raw/precip.nc
Group: /
Dimensions
lon = 10
lat = 10
time = 15341
Variables
lon (10)
Datatype: Union{Missing, Float32} (Float32)
Dimensions: lon
Attributes:
_FillValue = NaN
long_name = Longitude
units = degrees_east
axis = X
standard_name = longitude
actual_range = Float32[0.25, 359.75]
coordinate_defines = center
lat (10)
Datatype: Union{Missing, Float32} (Float32)
Dimensions: lat
Attributes:
_FillValue = NaN
actual_range = Float32[89.75, -89.75]
long_name = Latitude
units = degrees_north
axis = Y
standard_name = latitude
coordinate_defines = center
time (15341)
Datatype: Union{Missing, DateTime} (Float64)
Dimensions: time
Attributes:
_FillValue = NaN
long_name = Time
axis = T
standard_name = time
coordinate_defines = start
actual_range = [692496.0, 701232.0]
delta_t = 0000-00-01 00:00:00
avg_period = 0000-00-01 00:00:00
_ChunkSizes = 1
units = hours since 1900-01-01
calendar = proleptic_gregorian
precip (10 × 10 × 15341)
Datatype: Union{Missing, Float32} (Float32)
Dimensions: lon × lat × time
Attributes:
_FillValue = -9.96921e36
var_desc = Precipitation
level_desc = Surface
statistic = Total
parent_stat = Other
long_name = Daily total of precipitation
cell_methods = time: sum
avg_period = 0000-00-01 00:00:00
actual_range = Float32[0.0, 428.02423]
units = mm
valid_range = Float32[0.0, 1000.0]
dataset = CPC Global Precipitation
_ChunkSizes = Int32[1, 360, 720]
missing_value = -9.96921e36
# Pull the coordinate vectors and the full precipitation cube into memory.
precip_time = ds_precip["time"][:];
precip_lon = ds_precip["lon"][:];
precip_lat = ds_precip["lat"][:];
# Attach mm units, then flip latitude so it runs south → north — the file
# stores it north → south (actual_range = [89.75, -89.75] above).
precip = ds_precip["precip"][:,:,:];precip = precip .* 1u"mm";precip_lat = reverse(precip_lat)
precip = reverse(precip, dims=2);2.1 Precipitation Heatmap: Past vs Present
# First vs last day of the record, side by side. The transpose (') puts
# latitude on the heatmap's y-axis (heatmap expects rows = y).
h1 = heatmap(
precip_lon,
precip_lat,
precip[:, :, 1]';
xlabel="Longitude",
ylabel="Latitude",
title="Precipitation on 1/1/1979"
)
h2 = heatmap(
precip_lon,
precip_lat,
precip[:, :, end]';
xlabel="Longitude",
ylabel="Latitude",
title="Precipitation on 12/31/2020"
)
# Close the dataset once the arrays are copied out ("closed Dataset" is
# fused REPL output).
plot(h1, h2; layout=(1, 2), size=(900, 400))close(ds_precip)closed Dataset
3 Temperature Data: ERA5 Reanalysis Dataset
# Dataset covers daily temperature (Kelvin) from 1/1/1979 to 12/31/2020.
# Open the 2 m temperature NetCDF file (longitude × latitude × time).
ds_temp = NCDataset("/Users/coleman/Documents/GitHub/project-01-precipitation-downscaling-ColemanNickum/data/raw/temp.nc")
# NOTE(review): text after `println(ds_temp)` is fused REPL output.
println(ds_temp)Dataset: /Users/coleman/Documents/GitHub/project-01-precipitation-downscaling-ColemanNickum/data/raw/temp.nc
Group: /
Dimensions
longitude = 21
latitude = 33
time = 15341
Variables
longitude (21)
Datatype: Union{Missing, Float64} (Float64)
Dimensions: longitude
Attributes:
_FillValue = NaN
latitude (33)
Datatype: Union{Missing, Float32} (Float32)
Dimensions: latitude
Attributes:
_FillValue = NaN
units = degrees_north
long_name = latitude
time (15341)
Datatype: DateTime (Int64)
Dimensions: time
Attributes:
units = days since 1979-01-01 00:00:00
calendar = proleptic_gregorian
t2m (21 × 33 × 15341)
Datatype: Union{Missing, Float32} (Float32)
Dimensions: longitude × latitude × time
Attributes:
_FillValue = NaN
units = K
long_name = 2 metre temperature
# Pull coordinate vectors and the full 2 m temperature cube into memory.
temp_time = ds_temp["time"][:];
temp_lon = ds_temp["longitude"][:];
temp_lat = ds_temp["latitude"][:];
# Flip latitude so it runs south → north, mirroring the precipitation grid.
temp = ds_temp["t2m"][:,:,:];temp_lat = reverse(temp_lat)
temp = reverse(temp, dims=2);3.1 Temperature Heatmap: Past vs Present
# First vs last day of the record; transpose puts latitude on the y-axis.
h1 = heatmap(
temp_lon,
temp_lat,
temp[:, :, 1]';
xlabel="Longitude",
ylabel="Latitude",
title="Temperature on 1/1/1979"
)
h2 = heatmap(
temp_lon,
temp_lat,
temp[:, :, end]';
xlabel="Longitude",
ylabel="Latitude",
title="Temperature on 12/31/2020"
)
plot(h1, h2; layout=(1, 2), size=(900, 400))3.2 Ensuring temperature and precipitation correspond to the same time period
# Both datasets must share the same daily time axis; convert DateTime →
# Date for the calendar arithmetic used in the split below.
# NOTE(review): two statements and their 15341-element Vector{Date} output
# are fused onto this single line by the notebook export.
@assert temp_time == precip_timetime_data = Dates.Date.(temp_time)15341-element Vector{Date}:
1979-01-01
1979-01-02
1979-01-03
1979-01-04
1979-01-05
1979-01-06
1979-01-07
1979-01-08
1979-01-09
1979-01-10
1979-01-11
1979-01-12
1979-01-13
⋮
2020-12-20
2020-12-21
2020-12-22
2020-12-23
2020-12-24
2020-12-25
2020-12-26
2020-12-27
2020-12-28
2020-12-29
2020-12-30
2020-12-31
4 Splitting Data: Training Period vs Testing Period
# Train/test split of the 42-year record: the code keeps everything up to
# `end - Year(10)` (i.e. through 2010-12-31) for training and the final
# 10 years (2011–2020) for testing — roughly a 76:24 split.
# NOTE(review): an earlier comment claimed a 12-year test period
# (2009–2021); the code actually splits off 10 years — confirm which
# ratio was intended.
split_idx = findfirst(time_data .== time_data[end] - Dates.Year(10))
train_idx = 1:split_idx
test_idx = (split_idx+1):length(time_data);4.1 Testing and Training Period Variables
# Slice both fields along the time axis with the shared index ranges.
precip_train = precip[:, :, train_idx];
precip_test = precip[:, :, test_idx];
temp_train = temp[:, :, train_idx];
temp_test = temp[:, :, test_idx];5 Preprocessing Data: Climatology and Mean Centering
"""
    preprocess(temp, temp_reference)

Turn a 3-D (lon × lat × time) temperature field into a 2-D
(space × time) anomaly matrix. The climatology (per-gridcell mean over
time) is computed from `temp_reference`, so training and testing data
can both be anomalized against the same training-period climatology.
"""
function preprocess(temp::Array{T, 3}, temp_reference::Array{T, 3})::AbstractMatrix where T
    # Per-location mean over the time axis of the reference field.
    reference_mean = mean(temp_reference, dims=3)
    # Flatten the two spatial dimensions into rows; columns are time steps.
    n_space = size(temp, 1) * size(temp, 2)
    return reshape(temp .- reference_mean, n_space, size(temp, 3))
end
5.1 Apply to Training and Testing Datasets
# Both splits are anomalized against the TRAINING climatology, so no
# information from the test period leaks into the reference mean.
n_lon, n_lat, n_t = size(temp)
temp_training_matrix = preprocess(temp_train, temp_train);
temp_testing_matrix = preprocess(temp_test, temp_train);6 Principle Component Analysis
6.1 Fitting
# Fit a PCA model to the training period; `pratio=0.999` lets the fit
# choose the number of principal components (capped at 10) needed to
# explain 99.9% of the variance.
PCA_model = fit(PCA, temp_training_matrix; maxoutdim=10, pratio=0.999);6.2 Plotting Variance Explained by PCs
# Variance accounted for by each principal component, individually and
# cumulatively, as fractions of the total variance.
variance_explained = principalvars(PCA_model)
total_var = var(PCA_model)
cumulative_var = cumsum(variance_explained)./total_var10-element Vector{Float32}:
0.9599857
0.98082733
0.98715866
0.9929077
0.99447197
0.9957567
0.99657375
0.9971035
0.9975621
0.99789464
p1 = plot(
variance_explained / total_var;
xlabel="Number of PCs",
ylabel="Fraction of Variance Explained",
label=false,
title="Variance Explained"
)
p2 = plot(
cumulative_var;
xlabel="Number of PCs",
ylabel="Fraction of Variance Explained",
label=false,
title="Cumulative Variance Explained"
)
plot(p1, p2; layout=(1, 2), size=(900, 400))
Note
I chose to select 4 principal components because, after plotting the variance explained, the first 4 principal components accounted for a cumulative variance of roughly 0.99, retaining enough information while also reducing the noise of outliers.
# NOTE(review): this `pc` assignment is dead code — the loop below
# overwrites `pc` on every iteration.
pc = projection(PCA_model)[:, 3];6.3 Plotting Spatial Patterns
p = []
# One spatial-pattern map per retained PC: reshape each column of the
# projection matrix back onto the lon × lat grid.
for i in 1:4
pc = projection(PCA_model)[:, i]
pc_reshaped = reshape(pc, n_lon, n_lat)'
# NOTE(review): `pi` shadows Base.pi inside this loop body.
pi = heatmap(
temp_lon,
temp_lat,
pc_reshaped;
xlabel="Longitude",
ylabel="Latitude",
title="PC $i",
aspect_ratio=:equal,
cmap=:PuOr
)
push!(p, pi)
end
plot(p...; layout=(1, 4), size=(1500, 600))6.4 Plotting Time Series
# Project the training anomalies onto the PCA basis: rows are PCs,
# columns are training time steps.
pc_ts = predict(PCA_model, temp_training_matrix)
# NOTE(review): `time_data` spans the FULL record (15341 days, per the
# printed vector above) while `pc_ts` has only the training columns, so
# the scatter x and y lengths differ — this likely should be
# Dates.month.(time_data[train_idx]). Confirm.
Months = Dates.month.(time_data)
custom_xticks = [1, 3, 6, 9, 12]
p = []
# One seasonal-cycle scatter per PC: PC value against calendar month.
for i in 1:4
pi = scatter(
Months,
pc_ts[i, :];
xlabel="Months in Year",
ylabel="PC $i",
title="PC $i",
label=false,
alpha=0.3,
color=:blue,
xticks=(custom_xticks, custom_xticks)
)
push!(p, pi)
end
plot(p...; layout=(1, 4), size=(1800, 600))7 Scatter Plots: Comparison of Influence on Rainfall
# Domain-average precipitation per training day, converted mm → inches.
avg_precip =
ustrip.(
u"inch", [mean(skipmissing(precip_train[:, :, t])) for t in 1:size(precip_train, 3)]
)
# Days where every gridcell is missing yield NaN means; treat them as dry.
avg_precip = replace(avg_precip, NaN => 0)
# Wettest 2% of training days, colored by rainfall, plotted in three
# pairwise PC planes.
# NOTE(review): p1_idx, p2_idx, and p3_idx below are three identical
# recomputations of the same top-2% mask.
p1_idx = findall(avg_precip .> quantile(avg_precip, 0.98))
p1 = scatter(
pc_ts[2, p1_idx],
pc_ts[3, p1_idx];
zcolor=avg_precip[p1_idx],
xlabel="PC 2",
ylabel="PC 3",
markersize=5,
clims=(0, 2.75),
title="Rainy Days",
label=false
)
p2_idx = findall(avg_precip .> quantile(avg_precip, 0.98))
p2 = scatter(
pc_ts[4, p2_idx],
pc_ts[3, p2_idx];
zcolor=avg_precip[p2_idx],
xlabel="PC 4",
ylabel="PC 3",
markersize=5,
clims=(0, 2.75),
title="Rainy Days",
label=false
)
p3_idx = findall(avg_precip .> quantile(avg_precip, 0.98))
p3 = scatter(
pc_ts[4, p3_idx],
pc_ts[2, p3_idx];
zcolor=avg_precip[p3_idx],
xlabel="PC 4",
ylabel="PC 2",
markersize=5,
clims=(0, 2.75),
title="Rainy Days",
label=false
)
plot(p1,p2, p3; size=(1000, 600), link=:both)8 K-NN: Resampling Algorithm
"""
    euclidean_distance(x, y)

Return the straight-line (L2) distance between two equal-length vectors.
"""
function euclidean_distance(x::AbstractVector, y::AbstractVector)::AbstractFloat
    return sqrt(sum(abs2, x .- y))
end
"""
    nsmallest(x, n)

Return the indices of the `n` smallest elements of `x`, ordered from
smallest value up. Throws if `n > length(x)`.
"""
function nsmallest(x::AbstractVector, n::Int)::Vector{Int}
    # partialsortperm only orders the first `n` positions — cheaper than a
    # full sortperm when n ≪ length(x), which is the K-NN use case here.
    return collect(partialsortperm(x, 1:n))
end
"""
    knn(X, X_i, K) -> (index, row)

Find the `K` rows of `X` nearest (Euclidean) to the query point `X_i`,
then randomly sample one of them with probability proportional to
inverse distance. Returns the sampled row index and that row.
"""
function knn(X::AbstractMatrix, X_i::AbstractVector, K::Int)::Tuple{Int,AbstractVector}
    # Distance from the query to every row of X.
    dist = [euclidean_distance(X_i, X[j, :]) for j in 1:size(X, 1)]
    idx = nsmallest(dist, K)
    # Exact match guard: a zero distance would make its inverse weight Inf,
    # so normalization would produce NaN weights and `sample` would
    # misbehave. Return the exact match deterministically instead.
    if iszero(dist[idx[1]])
        return (idx[1], vec(X[idx[1], :]))
    end
    # Inverse-distance weights, normalized to sum to one.
    w = 1 ./ dist[idx]
    w ./= sum(w)
    idx_sample = sample(idx, Weights(w))
    return (idx_sample, vec(X[idx_sample, :]))
end
8.1 Combining KNN and PCA
"""
KNN resampling algorithm
temp_train: the training temperature data. This should be the raw field, not the prinipal components. Inside the function, convert to principal components using `n_pca` principal components.
temp_test: the temperature data to predict. This should be the raw field, not the prinipal components. Inside the function, convert to principal components using `n_pca` principal components.
precip_train: the training precipitation data.
"""
function predict_knn(temp_train, temp_test, precip_train; n_pca::Int)
X_train = preprocess(temp_train, temp_train)
X_test = preprocess(temp_test, temp_train)
# fit the PCA model to the training data
pca_model = fit(PCA, X_train; maxoutdim=n_pca)
# project the test data onto the PCA basis
train_embedded = predict(pca_model, X_train)
test_embedded = predict(pca_model, X_test)
# use the `knn` function for each point in the test data
precip_pred = map(1:size(X_test, 2)) do i
idx, _ = knn(train_embedded', test_embedded[:, i], 3)
precip_train[:, :, idx]
end
# return a matrix of predictions
return precip_pred
endpredict_knn
8.2 Predicted vs. Actual Values
# Pick four test days at random and compare the resampled prediction to
# the observed field.
# NOTE(review): `rand` is unseeded here, so the selected days (and the
# sampling inside `knn`) change on every run — seed an RNG for
# reproducible figures.
t_sample = rand(1:size(temp_test, 3), 4)
precip_pred = predict_knn(temp_train, temp_test[:, :, t_sample], precip_train; n_pca=4)
p = map(eachindex(t_sample)) do ti
t = t_sample[ti]
y_pred = precip_pred[ti]'
y_actual = precip_test[:, :, t]'
# Shared color scale so the two panels are directly comparable.
cmax = max(maximum(skipmissing(y_pred)), maximum(skipmissing(y_actual)))
cmax = ustrip(u"mm", cmax)
p1 = heatmap(
precip_lon,
precip_lat,
y_pred;
xlabel="Longitude",
ylabel="Latitude",
title="Predicted",
aspect_ratio=:equal,
clims=(0, cmax)
)
p2 = heatmap(
precip_lon,
precip_lat,
y_actual;
xlabel="Longitude",
ylabel="Latitude",
title="Actual",
aspect_ratio=:equal,
clims=(0, cmax)
)
plot(p1, p2; layout=(2, 1), size=(1000, 400))
end
plot(p...; layout=(2, 3), size=(1500, 1200))